library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
library(uwot)
## Loading required package: Matrix
library(gridExtra)
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
## Loading required package: rpart
##
## spatstat 1.63-3 (nickname: 'Wet paint')
## For an introduction to spatstat, type 'beginner'
##
## Note: R version 3.6.0 (2019-04-26) is more than a year old; we strongly recommend upgrading to the latest version
##
## Attaching package: 'spatstat'
## The following object is masked from 'package:scales':
##
## rescale
## The following object is masked from 'package:data.table':
##
## shift
library(ggdendro)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.3
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::combine() masks gridExtra::combine()
## x purrr::discard() masks scales::discard()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x tidyr::pack() masks Matrix::pack()
## x purrr::transpose() masks data.table::transpose()
## x tidyr::unpack() masks Matrix::unpack()
library(grid)
library(gridExtra) # add to AWS
use_condaenv(condaenv = 'r-reticulate', required = TRUE)
data_dir <- here::here('..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))
censor_negative_min = -1500
redo_umap_clustering = F
redo_umap_param_search = F
#only redo param search if doing clustering also:
redo_umap_clustering = redo_umap_clustering && redo_umap_clustering
# map day 0 to both plus and minus
map_day_0 <- function(df) {
return(rbind(
df[k562!='none'],
df[k562=='none'][, k562 := 'cd19+'],
df[k562=='none'][, k562 := 'cd19-']
))
}
For now, skip straight to high dimensional plotting.
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map except cd4/8
# removing zombie, cd_t, myc_t, gfp_t
umap_vars <- c(paste0(names(channel_map),'_t'))
umap_vars <- umap_vars[umap_vars != c('cd_t', 'zombie_t', 'myc_t', 'gfp_t')]
## Warning in umap_vars != c("cd_t", "zombie_t", "myc_t", "gfp_t"): longer object
## length is not a multiple of shorter object length
sample_for_umap <- function(df, sample_size) {
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(sample_size, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
}
cast_for_umap <- function(df) {
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(df,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
return(umap_cast_dt)
}
scale_for_umap <- function(df, umap_vars, censor_negative_min) {
umap_dt_in <- df[, ..umap_vars]
# for points that are very negative, trim the to below the cutoff
umap_dt_in[umap_dt_in < censor_negative_min] <- censor_negative_min
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
return(umap_dt_in)
}
# check scales:
# ggplot(melt(cbind(umap_dt_in, id=1:nrow(umap_dt_in)), id.var='id'),
# aes(x=value)) +
# geom_density() +
# facet_grid(variable~.) +
# scale_fill_distiller(palette='RdYlBu') +
# theme_bw()
First, find parameters with a small sample.
# use umap canned functions
umap_dt <- sample_for_umap(diff_dt, 20)
umap_cast_dt <- cast_for_umap(umap_dt)
umap_dt_in <- scale_for_umap(umap_cast_dt, umap_vars, censor_negative_min)
### Run across parameter list
#umap parameter list
min_dist_set <- c(0.0001, 0.005, 0.1)
n_neighbor_set <- c(3,6,15,30)
learning_rate_set <- c(0.1, 0.25, 0.5, 1)
umap_params <- data.table(expand.grid(min_dist_set, n_neighbor_set, learning_rate_set))
# run umap via uwot library
umap_out <- umap_params[, {
print(paste0('Running: ',.BY));
as.data.table(umap(umap_dt_in, min_dist=as.numeric(.BY[1]),
n_neighbors=as.numeric(.BY[2]), learning_rate=as.numeric(.BY[3]), verbose=F, n_threads=8, n_trees=100))
}, by=names(umap_params)]
# rename the columns
names(umap_out) <- c('min_dist','n_neighbor', 'learning_rate', 'umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
color_points <- ggplot(umap_fcs_dt[car %in% c('CD28','41BB','KLRG1','BAFF-R','Zeta')],
aes(x=umap_1, y=umap_2, color=interaction(car, t_type))) +
geom_point(size=0.1, alpha=0.1) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3))) +
facet_grid(min_dist~n_neighbor+learning_rate) +
theme_bw()
color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
facet_grid(min_dist~n_neighbor+learning_rate) +
theme_bw()
grid.arrange(color_points, color_density, ncol=2)
Choosing neighbors == 30 and min_dist == 1e-4.
Scaling this to 200 events per well and checking CAR/condition separation.
chosen_dist=0.0001
chosen_n_neighbor=100 # default is 15
chosen_learning_rate=0.1
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(1000, length(unique(event_id))))], #modify minimum parameter for points per well
by=c('well', 'plate', 'day')]
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map
umap_vars <- c(paste0(names(channel_map),'_t'))
# removing cd_t, zombie_t, myc_t, gfp_t from umap_vars
umap_vars <- umap_vars[!umap_vars %in% c('zombie_t', 'myc_t', 'gfp_t')]
# filter by cd4/8
umap_cd4_dt <- umap_dt %>% filter(t_type=='cd4')
umap_cd8_dt <- umap_dt %>% filter(t_type=='cd8')
umap_cd4_cast_dt <- dcast(umap_cd4_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
umap_cd4_dt_in <- umap_cd4_cast_dt[, umap_vars]
# scale each input column
umap_cd4_dt_in <- scale(umap_cd4_dt_in)
# run umap via uwot library (added learning rate and n_sgd_threads=1)
set.seed(123)
umap_cd4_out <- umap(umap_cd4_dt_in, min_dist=chosen_dist,
n_neighbors=chosen_n_neighbor, learning_rate=chosen_learning_rate, verbose=T, n_threads=8, n_trees=100, n_sgd_threads=1,
ret_nn=T)
# nearest neighbors in original space
nearest_neighbors_cd4 <- umap_cd4_out$nn$euclidean$idx
neighbor_dist_cd4 <- umap_cd4_out$nn$euclidean$dist
edge_weights_cd4 <- scales::rescale(-neighbor_dist_cd4, to=c(0,1))
edge_weights_cd4 <- edge_weights_cd4 - min(edge_weights_cd4)
umap_cd4_out <- data.table(umap_cd4_out$embedding)
#nearest neighbors in embedding
umap_nn <- cbind(1:nrow(umap_fcs_dt),
nnwhich(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_nd <- cbind(rep(0, nrow(umap_fcs_dt)),
nndist(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_ew <- scales::rescale(-umap_nd, to=c(0,1))
umap_ew <- umap_ew - min(umap_ew)
# rename the columns
names(umap_cd4_out) <- c('umap_1','umap_2')
# add the umap output to the input dt
umap_cd4_fcs_dt <- cbind(umap_cd4_cast_dt[, 1:length(names(umap_cd4_cast_dt))], umap_cd4_out)
umap_cd4_fcs_dt <- tibble::rowid_to_column(umap_cd4_fcs_dt, "id")
# add back the annotations
umap_cd4_fcs_dt <- umap_cd4_fcs_dt %>% left_join(distinct(umap_cd4_dt, donor, car, k562, t_type, day, well, event_id), by=c('t_type', 'day', 'well', 'event_id'))
umap_cd8_cast_dt <- dcast(umap_cd8_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
umap_cd8_dt_in <- umap_cd8_cast_dt[, umap_vars]
# scale each input column
umap_cd8_dt_in <- scale(umap_cd8_dt_in)
# run umap via uwot library (added learning rate and n_sgd_threads=1)
set.seed(123)
umap_cd8_out <- umap(umap_cd8_dt_in, min_dist=chosen_dist,
n_neighbors=chosen_n_neighbor, learning_rate=chosen_learning_rate, verbose=T, n_threads=8, n_trees=100, n_sgd_threads=1,
ret_nn=T)
# nearest neighbors in original space
nearest_neighbors_cd8 <- umap_cd8_out$nn$euclidean$idx
neighbor_dist_cd8 <- umap_cd8_out$nn$euclidean$dist
edge_weights_cd8 <- scales::rescale(-neighbor_dist_cd8, to=c(0,1))
edge_weights_cd8 <- edge_weights_cd8 - min(edge_weights_cd8)
umap_cd8_out <- data.table(umap_cd8_out$embedding)
#nearest neighbors in embedding
umap_nn <- cbind(1:nrow(umap_fcs_dt),
nnwhich(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_nd <- cbind(rep(0, nrow(umap_fcs_dt)),
nndist(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_ew <- scales::rescale(-umap_nd, to=c(0,1))
umap_ew <- umap_ew - min(umap_ew)
# rename the columns
names(umap_cd8_out) <- c('umap_1','umap_2')
# add the umap output to the input dt
umap_cd8_fcs_dt <- cbind(umap_cd8_cast_dt[, 1:length(names(umap_cd8_cast_dt))], umap_cd8_out)
umap_cd8_fcs_dt <- tibble::rowid_to_column(umap_cd8_fcs_dt, "id")
# add back the annotations
umap_cd8_fcs_dt <- umap_cd8_fcs_dt %>% left_join(distinct(umap_cd8_dt, donor, car, k562, t_type, day, well, event_id), by=c('t_type', 'day', 'well', 'event_id'))
import leidenalg
import igraph as ig
import pandas as pd
import numpy as np
import math
# first, for CD4
edges_cd4 = []
n_neighbors_cd4 = r.nearest_neighbors_cd4.shape[1]
for i, nn_row in enumerate(r.nearest_neighbors_cd4):
edge_weights_cd4 = r.edge_weights_cd4[i]
for j in range(1, n_neighbors_cd4):
edges_cd4.append((int(nn_row[0]), int(nn_row[j]), edge_weights_cd4[j]))
knn_graph_cd4 = ig.Graph.TupleList(edges_cd4, directed=True, weights=True)
part_cd4 = leidenalg.find_partition(knn_graph_cd4,
leidenalg.ModularityVertexPartition, weights='weight')
#part = leidenalg.find_partition(knn_graph,
# leidenalg.CPMVertexPartition, weights='weight',
# resolution_parameter=0.05)
cluster_membership_cd4 = [x for _,x in sorted(zip(knn_graph_cd4.vs['name'],part_cd4.membership))]
# again for CD8
edges_cd8 = []
n_neighbors_cd8 = r.nearest_neighbors_cd8.shape[1]
for i, nn_row in enumerate(r.nearest_neighbors_cd8):
edge_weights_cd8 = r.edge_weights_cd8[i]
for j in range(1, n_neighbors_cd8):
edges_cd8.append((int(nn_row[0]), int(nn_row[j]), edge_weights_cd8[j]))
knn_graph_cd8 = ig.Graph.TupleList(edges_cd8, directed=True, weights=True)
part_cd8 = leidenalg.find_partition(knn_graph_cd8,
leidenalg.ModularityVertexPartition, weights='weight')
#part = leidenalg.find_partition(knn_graph,
# leidenalg.CPMVertexPartition, weights='weight',
# resolution_parameter=0.05)
cluster_membership_cd8 = [x for _,x in sorted(zip(knn_graph_cd8.vs['name'],part_cd8.membership))]
# convert into data tables
umap_cd4_fcs_dt <- as.data.table(umap_cd4_fcs_dt)
umap_cd8_fcs_dt <- as.data.table(umap_cd8_fcs_dt)
#add new column to umap
umap_cd4_fcs_dt[, cluster := factor(py$cluster_membership_cd4+1)]
umap_cd8_fcs_dt[, cluster := factor(py$cluster_membership_cd8+1)]
# save the clustering data
fwrite(umap_cd4_fcs_dt,
compress='gzip',
file=file.path(here::here('..','data','diff.sampled.umap_cd4.csv.gz')))
fwrite(umap_cd8_fcs_dt,
compress='gzip',
file=file.path(here::here('..','data','diff.sampled.umap_cd8.csv.gz')))
# sync output back to s3
system('aws s3 sync \\
--exclude "*" \\
--include "*.Rdata" \\
--include "*.csv*" \\
~/local-tcsl/data s3://roybal-tcsl//lenti_screen_compiled_data/data')
One issue with these clusters is that they are not contiguous in the UMAP embedding: ### cd8 umap plot
umap_cd8_fcs_dt <- fread(
file.path(
here::here('..','data','diff.sampled.umap_cd8.csv.gz')))
umap_cd8_fcs_dt[, cluster := factor(cluster)]
umap_cd8_cluster_dt <- umap_cd8_fcs_dt[, list(
mean_umap_1= mean(umap_1),
mean_umap_2= mean(umap_2),
size= .N), by=cluster]
# whole plot
ggplot() +
geom_point(data=umap_cd8_fcs_dt,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=umap_cd8_cluster_dt, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
theme_minimal()
# individual plots
ggplot() +
geom_point(data=umap_cd8_fcs_dt,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=umap_cd8_cluster_dt, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
facet_wrap(~cluster) +
theme_bw()
### cd4 umap plot
umap_cd4_fcs_dt <- fread(
file.path(
here::here('..','data','diff.sampled.umap_cd4.csv.gz')))
umap_cd4_fcs_dt[, cluster := factor(cluster)]
umap_cd4_cluster_dt <- umap_cd4_fcs_dt[, list(
mean_umap_1= mean(umap_1),
mean_umap_2= mean(umap_2),
size= .N), by=cluster]
# whole plot
ggplot() +
geom_point(data=umap_cd4_fcs_dt,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=umap_cd4_cluster_dt, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
theme_minimal()